// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H

namespace Eigen {

/** \class TensorCostModel
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for
  * evaluating a tensor expression.
  *
  */

// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles
// required.
class TensorOpCost
{
public:
    // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
    // model based on minimal reciprocal throughput numbers from Intel or
    // Agner Fog's tables would be better than what is there now.
    template <typename ArgType> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()
    {
        return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType>>::Cost;
    }
    template <typename ArgType> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
    {
        return internal::functor_traits<internal::scalar_sum_op<ArgType>>::Cost;
    }
    template <typename ArgType> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
    {
        return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType>>::Cost;
    }
    template <typename ArgType> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
    {
        return internal::functor_traits<internal::scalar_mod_op<ArgType>>::Cost;
    }
    template <typename SrcType, typename TargetType> static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
    {
        return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType>>::Cost;
    }
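
    // For example (illustrative), an expression that casts each int
    // coefficient to float would be assigned a per-coefficient compute cost of
    // CastCost<int, float>().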

    EIGEN_DEVICE_FUNC
    TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
    EIGEN_DEVICE_FUNC
    TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
        : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles)
    {
    }

    EIGEN_DEVICE_FUNC
    TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
        : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles)
    {
        eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
        eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
        eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
    }
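
    // Note that compute_cycles is interpreted as the per-packet cost when
    // vectorized == true, and is normalized to a per-coefficient cost by the
    // division above. For example (illustrative numbers),
    //   TensorOpCost(sizeof(float), 0, /*compute_cycles=*/8,
    //                /*vectorized=*/true, /*packet_size=*/4)
    // stores compute_cycles() == 2.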

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
    {
        return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
    }

    // Drop memory access component. Intended for cases when memory accesses are
    // sequential or are completely masked by computations.
    EIGEN_DEVICE_FUNC void dropMemoryCost()
    {
        bytes_loaded_ = 0;
        bytes_stored_ = 0;
    }

    // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const
    {
        double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
        double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
        double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
        return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
    }

    // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const
    {
        double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
        double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
        double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
        return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
    }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs)
    {
        bytes_loaded_ += rhs.bytes_loaded();
        bytes_stored_ += rhs.bytes_stored();
        compute_cycles_ += rhs.compute_cycles();
        return *this;
    }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs)
    {
        bytes_loaded_ *= rhs;
        bytes_stored_ *= rhs;
        compute_cycles_ *= rhs;
        return *this;
    }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs)
    {
        lhs += rhs;
        return lhs;
    }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
    {
        lhs *= rhs;
        return lhs;
    }
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
    {
        rhs *= lhs;
        return rhs;
    }

    friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc)
    {
        return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored() << ", compute_cycles = " << tc.compute_cycles() << "]";
    }

private:
    double bytes_loaded_;
    double bytes_stored_;
    double compute_cycles_;
};
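
// A minimal usage sketch (illustrative only; `num_coeffs` is a hypothetical
// variable, not part of this file): build the per-coefficient cost of loading
// one float, multiplying it, and storing the result, then scale by the number
// of output coefficients.
//
//   TensorOpCost per_coeff(sizeof(float), sizeof(float),
//                          TensorOpCost::MulCost<float>());
//   TensorOpCost total = per_coeff * num_coeffs;  // scale per-coeff cost
//   total += TensorOpCost(0, 0, 100);             // add a fixed overhead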

// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of
// threads in [1:max_threads] instead of just switching multi-threading off
// for small work units.
template <typename Device> class TensorCostModel
{
public:
    // Scaling from Eigen compute cost to device cycles.
    static const int kDeviceCyclesPerComputeCycle = 1;

    // Costs in device cycles.
    static const int kStartupCycles = 100000;
    static const int kPerThreadCycles = 100000;
    static const int kTaskSize = 40000;

    // Returns the number of threads in [1:max_threads] to use for
    // evaluating an expression with the given output size and cost per
    // coefficient.
    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff, int max_threads)
    {
        double cost = totalCost(output_size, cost_per_coeff);
        double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
        // Make sure we don't invoke undefined behavior when we convert to an int.
        threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
        return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
    }
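
    // Worked example for numThreads (illustrative): with a total cost of
    // 500,000 device cycles, threads = (500000 - 100000) / 100000 + 0.9 = 4.9,
    // which truncates to 4 threads; the + 0.9 term biases the truncation
    // toward rounding up. Expressions costing no more than kStartupCycles
    // always get a single thread.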

    // taskSize assesses the size of a parallel task relative to the ideal size.
    // A value of 1.0 means ideal parallel task size. Values < 1.0 mean that
    // task granularity needs to be increased to mitigate parallelization
    // overheads.
    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff)
    {
        return totalCost(output_size, cost_per_coeff) / kTaskSize;
    }
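
    // For example (illustrative), a total cost of 10,000 device cycles gives
    // taskSize == 0.25, i.e. the task is 4x too small to amortize the
    // parallelization overheads.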

    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost& cost_per_coeff)
    {
        // Cost of memory fetches from the L2 cache. 64 is a typical cache line
        // size; 11 cycles is the L2 cache latency on Haswell.
        // We don't know whether the data is in L1, L2, or L3, but we are most
        // interested in single-threaded computations taking roughly 100us-10ms:
        // smaller times are too small to parallelize, and larger times are not
        // interesting either, because we are probably already using all
        // available threads. For that target range, L2 is what matters: a data
        // set that fits in L1 is too small to take noticeable time, while a
        // data set that fits only in L3 will presumably take more than 10ms to
        // load and process.
        const double kLoadCycles = 1.0 / 64 * 11;
        const double kStoreCycles = 1.0 / 64 * 11;
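        // That is, 11 cycles to fetch a 64-byte cache line, amortized to
        // 11 / 64 ~= 0.17 cycles per byte loaded or stored.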
        // Scaling from Eigen compute cost to device cycles.
        return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
    }
};

}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
